# ***** Anomaly Detection using FbProphet *****
import datetime
import pandas as pd
import requests
import matplotlib as mpl
import matplotlib.pyplot as plt
import os
import plotly.express as px
import numpy as np
from fbprophet import Prophet
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
from fancyimpute import KNN
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import chart_studio.plotly as py
import matplotlib.pyplot as plt
from matplotlib import pyplot
import plotly.graph_objs as go
# Notebook/plotting environment setup: enable offline plotly rendering and
# set matplotlib defaults used by every subsequent figure.
init_notebook_mode(connected=True)
mpl.rcParams.update({'figure.figsize': (10, 8), 'axes.grid': False})
print("Libraries imported succesfully")
# ---- Load raw data and normalize column dtypes ----
df_ads = pd.read_csv('ads_challenge.csv')
display(df_ads)

# Inspect raw dtypes before any conversion.
df_ads.dtypes

# 'Date' arrives as a string; parse into datetime64 for time-series work.
df_ads['Date'] = pd.to_datetime(df_ads['Date'])
df_ads.dtypes

# Detailed per-column summary (dtype, non-null counts, memory usage).
df_ads.info()

# (rows, columns)
df_ads.shape

# Percentage of missing values per column.
df_ads.isnull().sum() / len(df_ads) * 100

# Impression counts are strings with thousands separators ("1,234") -> float.
# regex=False makes the literal replacement explicit (same behavior, no
# pattern interpretation).
for col in ('ad_type1_impressions', 'ad_type2_impressions'):
    df_ads[col] = df_ads[col].str.replace(',', '', regex=False).astype('float')

# Percent-formatted strings ("12.3%") -> fractional floats (0.123).
for col in ('ad_type1_CTR', 'ad_type2_CTR', 'ad_type2_videos_completed'):
    df_ads[col] = df_ads[col].str.replace('%', '', regex=False).astype('float') * 10 ** (-2)

df_ads.head()
Creating a dataframe with the important features selected. For now I am selecting impressions as well as type-2 videos completed. We could omit them, as their importance is not significant, but that decision should be based on domain-expertise knowledge to improve the performance further. For now I am keeping these variables.
# Feature subset used for modelling: date, country, type-2 volume metrics and
# both CTRs (type-2 CTR is the eventual target).
ctr_video = df_ads[['Date', 'Country', 'ad_type2_impressions',
                    'ad_type2_videos_completed', 'ad_type1_CTR', 'ad_type2_CTR']]

# Daily type-1 CTR trend for the United States with a range slider/selector.
fig = px.line(df_ads[df_ads.Country == "United States"].reset_index(),
              x='Date', y=['ad_type1_CTR', 'ad_type1_impressions'],
              title='Type-1 CTR')
fig.update_xaxes(
    rangeslider_visible=True,
    rangeselector=dict(
        buttons=list([
            # BUG FIX: button counts now match their labels (the original
            # used count=2 for "6m" and count=3 for "12m", so those buttons
            # only showed 2- and 3-month windows).
            dict(count=1, label='1m', step='month', stepmode='backward'),
            dict(count=6, label='6m', step='month', stepmode='backward'),
            dict(count=12, label='12m', step='month', stepmode='todate'),
            dict(step='all')
        ])
    )
)
fig.show()
As you can see, the data is entirely missing from Jan 9 until Feb 14 for both impressions and CTR for ads group 1. There can be multiple reasons for this:
1) Data was not gathered/collected or measured due to some technical issue.
2) There was no data — no ads were shown during that period.
To address the above cases, I am making an assumption that, due to the start of COVID, no ads were shown during that period for group 1, and therefore we don't have data points available for that period of time. There are multiple ways to deal with this; for the sake of simplicity, for now I am replacing these values with 0.
# ---- Handle missing values ----
# Work on an explicit copy so the assignments below don't raise
# SettingWithCopyWarning on the df_ads slice created above.
ctr_video = ctr_video.copy()

# Assumption: no type-1 ads ran during the gap, so a missing CTR means zero.
ctr_video['ad_type1_CTR'] = ctr_video['ad_type1_CTR'].fillna(0)

# Drop rows where ALL type-2 metrics are simultaneously null.
# BUG FIX: 'Date' removed from the subset -- it is never null, so with
# how='all' its presence meant no row could ever be dropped.
ctr_video.dropna(subset=['ad_type2_impressions', 'ad_type2_videos_completed',
                         'ad_type2_CTR'], inplace=True, how='all')

# Remaining rows whose type-2 CTR is still missing (imputed by KNN below).
ctr_video[ctr_video.ad_type2_CTR.isna()]

# Use the date as the index from here on.
ctr_video = ctr_video.set_index('Date')

# Split columns by dtype: numeric columns go straight to KNN; categorical
# columns must be ordinal-encoded first.
num_features = ctr_video.select_dtypes(include=['int64', 'float64']).columns
cat_features = ctr_video.select_dtypes(include=['object', 'category']).columns

# Ordinal-encode each categorical column (non-null values only), keeping the
# fitted encoders so the encoding can be inverted after imputation.
ordinal_enc_dict = {}
for col_name in ctr_video[cat_features]:
    ordinal_enc_dict[col_name] = OrdinalEncoder()
    col = ctr_video[col_name]
    # Fit/transform only the non-null entries.
    col_not_null = col[col.notnull()]
    reshaped_vals = col_not_null.values.reshape(-1, 1)
    encoded_vals = ordinal_enc_dict[col_name].fit_transform(reshaped_vals)
    # Write the encoded values back into the non-null positions.
    ctr_video.loc[col.notnull(), col_name] = np.squeeze(encoded_vals)

# Impute the remaining missing values with KNN (fancyimpute).
KNN_imputer = KNN()
ctr_video.iloc[:, :] = KNN_imputer.fit_transform(ctr_video)
ctr_video.head()
# ---- Min-max scale the numeric features (for plotting on a shared axis) ----
ctr_video_scaled = ctr_video.copy()

from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline          # NOTE(review): unused below
from sklearn.compose import ColumnTransformer  # NOTE(review): unused below

scaler = MinMaxScaler()
numeric_features = ['ad_type2_impressions', 'ad_type2_videos_completed',
                    'ad_type1_CTR', 'ad_type2_CTR']
# Scale each numeric feature to [0, 1] in place on the copy.
# (Dead commented-out StandardScaler experiment removed.)
ctr_video_scaled[numeric_features] = scaler.fit_transform(ctr_video_scaled[numeric_features])
ctr_video_scaled.head()
# Restore the original (string) values of the categorical columns on both the
# scaled and unscaled frames by inverting the ordinal encoding fitted earlier.
for frame in (ctr_video_scaled, ctr_video):
    for col_name in cat_features:
        encoded = frame[col_name].values.reshape(-1, 1)
        frame[col_name] = ordinal_enc_dict[col_name].inverse_transform(encoded)
ctr_video_scaled.head()
ctr_video.head()
# Scaled type-2 impressions vs CTR for the United States.
fig = px.line(ctr_video_scaled[ctr_video_scaled.Country == "United States"].reset_index(),
              x='Date', y=['ad_type2_impressions', 'ad_type2_CTR'],
              title='Impressions Vs Click Through Rate')
fig.update_xaxes(
    rangeslider_visible=True,
    rangeselector=dict(
        buttons=list([
            # BUG FIX: counts now match labels (original used count=2 for
            # "6m" and count=3 for "12m").
            dict(count=1, label='1m', step='month', stepmode='backward'),
            dict(count=6, label='6m', step='month', stepmode='backward'),
            dict(count=12, label='12m', step='month', stepmode='todate'),
            dict(step='all')
        ])
    )
)
fig.show()
# NOTE(review): this cell is an exact duplicate of the previous US plot;
# kept to preserve notebook output, but consider deleting one of the two.
fig = px.line(ctr_video_scaled[ctr_video_scaled.Country == "United States"].reset_index(),
              x='Date', y=['ad_type2_impressions', 'ad_type2_CTR'],
              title='Impressions Vs Click Through Rate')
fig.update_xaxes(
    rangeslider_visible=True,
    rangeselector=dict(
        buttons=list([
            # BUG FIX: counts now match labels (original used count=2 for
            # "6m" and count=3 for "12m").
            dict(count=1, label='1m', step='month', stepmode='backward'),
            dict(count=6, label='6m', step='month', stepmode='backward'),
            dict(count=12, label='12m', step='month', stepmode='todate'),
            dict(step='all')
        ])
    )
)
fig.show()
# Scaled type-2 CTR vs impressions for India.
fig = px.line(ctr_video_scaled[ctr_video_scaled.Country == "India"].reset_index(),
              x='Date', y=['ad_type2_CTR', 'ad_type2_impressions'],
              title='Impressions Vs Click Through Rate')
fig.update_xaxes(
    rangeslider_visible=True,
    rangeselector=dict(
        buttons=list([
            # BUG FIX: counts now match labels (original used count=2 for
            # "6m" and count=3 for "12m").
            dict(count=1, label='1m', step='month', stepmode='backward'),
            dict(count=6, label='6m', step='month', stepmode='backward'),
            dict(count=12, label='12m', step='month', stepmode='todate'),
            dict(step='all')
        ])
    )
)
fig.show()
# Quick matplotlib check of the scaled US type-2 metrics.
us_cols = ['ad_type2_impressions', 'ad_type2_CTR', 'ad_type2_videos_completed']
ctr_video_scaled.query("Country=='United States'")[us_cols].plot()

# Countries present in the data.
ctr_video['Country'].unique()
ctr_video.head()
# Prophet expects the target named 'y' and the datetime column named 'ds'.
ctr_video.rename(columns={'ad_type2_CTR': 'y'}, inplace=True)
ctr_video.index.names = ['ds']

# Namibia and Guam have fewer than 3 observations each -- too few for
# fbprophet (it needs at least two training points) -- so exclude them,
# then group by country so each country is modelled individually.
mask = ~ctr_video.Country.isin(['Namibia', 'Guam'])
country_vid_ctr = ctr_video[mask].reset_index().groupby('Country')
country_vid_ctr.head()
# Displaying data
country_vid_ctr.head()
# ---- Per-country Prophet modelling ----
# Accumulates, for every country, the forecast, its error vs. the actuals,
# the model's uncertainty band width, and a first anomaly flag -- all
# outer-merged on the 'ds' date index.
target = pd.DataFrame()
# One Prophet model per country group.
for country in country_vid_ctr.groups:
    group = country_vid_ctr.get_group(country)
    # 95% uncertainty interval on the forecast.
    model = Prophet(interval_width=0.95)
    # Extra regressors: type-2 impressions/completions plus type-1 CTR.
    model.add_regressor('ad_type2_impressions')
    model.add_regressor('ad_type2_videos_completed')
    model.add_regressor('ad_type1_CTR')
    # 80/20 chronological split (assumes rows within each group are already
    # date-ordered -- TODO confirm).
    train_size = int(len(group) * 0.8)
    train = group[:train_size]
    test_size = int(len(group)) - train_size
    # Fit on the training portion only.
    model.fit(train)
    # Extend test_size periods past the training window.
    # NOTE(review): the regressor assignments below assume the future frame
    # has exactly len(group) rows, i.e. the training dates are contiguous
    # daily observations -- verify for series with gaps.
    future = model.make_future_dataframe(periods=test_size)
    future['ad_type2_impressions'] = list(group["ad_type2_impressions"])
    future['ad_type2_videos_completed'] = list(group["ad_type2_videos_completed"])
    future['ad_type1_CTR'] = list(group["ad_type1_CTR"])
    # Forecast over history + test horizon.
    forecast = model.predict(future)
    # Prophet's built-in forecast plot.
    model.plot(forecast)
    # Suffix the per-country columns so all countries fit in one frame.
    forecast = forecast.rename(columns={'yhat': 'yhat_'+country, 'yhat_lower': 'yhat_lower_'+country, 'yhat_upper': 'yhat_upper_'+country})
    forecast['y_'+country] = list(group["y"])
    # Error = actual - predicted.
    forecast['error_'+country] = forecast['y_'+country] - forecast['yhat_'+country]
    # Uncertainty = width of the 95% interval (upper bound - lower bound).
    forecast['uncertainity_'+country] = forecast['yhat_upper_'+country] - forecast['yhat_lower_'+country]
    # First-pass anomaly flag: |error| > 1.5 x interval width.
    forecast['anomaly_'+country] = forecast.apply(lambda x: 'Yes' if (np.abs(x['error_'+country]) > 1.5*x['uncertainity_'+country]) else 'No', axis=1)
    # Outer-merge this country's results into the combined frame on 'ds'.
    target = pd.merge(target, forecast[['yhat_'+country, 'yhat_lower_'+country, 'yhat_upper_'+country, 'y_'+country, 'ds', 'error_'+country, 'uncertainity_'+country, 'anomaly_'+country]].set_index('ds'), how='outer',
                      left_index=True, right_index=True)
display(target)
target.columns
# Replace the NaNs introduced by the outer merge with 0 before scoring.
target = target.fillna(value=0)
# Severity grading (next cell): each anomaly gets a 0-3 level, 3 being the
# most severe. The function computes the rolling (one-week window) mean and
# standard deviation of the error and uses 1.5 / 1.75 / 2 rolling standard
# deviations as the low / medium / high anomaly limits (roughly 5% of the
# data points end up flagged under this scheme).
def anomaly_detection(target, win):
    """Add rolling-error severity columns to *target* for every country.

    For each country this adds: the percentage difference between actual and
    forecast, the rolling (window=win) mean and std of the forecast error,
    symmetric bands at 1.5/1.75/2 rolling standard deviations, a 0-3
    severity level ('color_', 3 = most severe), the sign of the deviation
    ('region_'), and the raw error at maximal-severity points
    ('anomaly_points_').

    Mutates and returns *target*. Reads the module-level country_vid_ctr
    groupby for the list of countries.
    """
    for country in country_vid_ctr.groups:
        # Relative forecast error in percent.
        # NOTE(review): divides by the actual value, which was filled with
        # 0 after the merge -- can produce inf; verify downstream handling.
        target['percentage_change_'+country] = ((target['y_'+country] - target['yhat_'+country]) / target['y_'+country]) * 100
        # Rolling mean/std of the error over the window.
        target['meanval_'+country] = target['error_'+country].rolling(window=win).mean()
        target['deviation_'+country] = target['error_'+country].rolling(window=win).std()
        # Bands around the rolling mean. Naming is by severity tier, not by
        # sigma count: the '3s' (high) band sits at 2 std, '2s' at 1.75,
        # '1s' at 1.5.
        target['-3s_'+country] = target['meanval_'+country] - (2 * target['deviation_'+country])
        target['3s_'+country] = target['meanval_'+country] + (2 * target['deviation_'+country])
        target['-2s_'+country] = target['meanval_'+country] - (1.75 * target['deviation_'+country])
        target['2s_'+country] = target['meanval_'+country] + (1.75 * target['deviation_'+country])
        target['-1s_'+country] = target['meanval_'+country] - (1.5 * target['deviation_'+country])
        target['1s_'+country] = target['meanval_'+country] + (1.5 * target['deviation_'+country])
        # For each row, locate the column position (0-7) of the error among
        # the row-sorted band values; position encodes which band the error
        # falls on.
        cut_list = target[['error_'+country, '-3s_'+country, '-2s_'+country, '-1s_'+country, 'meanval_'+country, '1s_'+country, '2s_'+country, '3s_'+country]]
        cut_values = cut_list.values
        cut_sort = np.sort(cut_values)
        # NOTE(review): np.where scans the whole sorted matrix, not just
        # row x -- an identical error value in another row could be matched
        # first. Verify on real data.
        target['impact_'+country] = [(lambda x: np.where(cut_sort == target['error_'+country][x])[1][0])(x) for x in
                                     range(len(target['error_'+country]))]
        # Map band position to severity (3 at either tail, 0 at the center)
        # and to the sign of the deviation.
        severity = {0: 3, 1: 2, 2: 1, 3: 0, 4: 0, 5: 1, 6: 2, 7: 3}
        region = {0: "NEGATIVE", 1: "NEGATIVE", 2: "NEGATIVE", 3: "NEGATIVE", 4: "POSITIVE", 5: "POSITIVE", 6: "POSITIVE",
                  7: "POSITIVE"}
        target['color_'+country] = target['impact_'+country].map(severity)
        target['region_'+country] = target['impact_'+country].map(region)
        # Keep the raw error only where severity is maximal (3); NaN elsewhere.
        target['anomaly_points_'+country] = np.where(target['color_'+country] == 3, target['error_'+country], np.nan)
    return target
# Score the merged frame with a one-week rolling window.
anomaly_severity = anomaly_detection(target, 7)
# anomaly_severity
# Manual sanity check of one percentage-change value.
(0.145270 - 0.156967) * 100 / 0.145270
# Inspect the India severity columns.
target[['impact_India', 'color_India', 'anomaly_points_India',
        'y_India', 'yhat_India', 'percentage_change_India']]
# Country rendered by plot_anomaly below.
country_filter = 'United States'
def plot_anomaly(df, metric_name):
    """Render the anomaly dashboard for the module-level country_filter.

    Layout (plotly, drawn via iplot): a data table in the bottom 30% of the
    figure, an error / moving-average / confidence-band panel (axes x1/y1),
    and an actuals-vs-predicted panel with anomaly markers (axes x2/y2).

    Reads the per-country columns produced by anomaly_detection ('y_',
    'yhat_', 'error_', 'meanval_', '3s_', '-3s_', 'anomaly_points_',
    'percentage_change_', 'color_'). Side effects only; returns None.
    """
    dates = df.ds
    # Rows whose anomaly_points value is non-zero (and non-NaN).
    bool_array = (abs(df['anomaly_points_'+country_filter]) > 0)
    actuals = df["y_"+country_filter][-len(bool_array):]
    # Actual values at anomalous rows only; zeros (non-anomalies) become
    # NaN so plotly skips them.
    anomaly_points = bool_array * actuals
    anomaly_points[anomaly_points == 0] = np.nan
    # Severity level (0-3) -> table cell colour.
    color_map = {0: "aliceblue", 1: "yellow", 2: "orange", 3: "red"}
    # Table of date / actual / predicted / % difference / severity.
    table = go.Table(
        domain=dict(x=[0, 1],
                    y=[0, 0.3]),
        columnwidth=[1, 2],
        header=dict(height=20,
                    values=[['<b>Date</b>'], ['<b>Actual Values </b>'],
                            ['<b>Predicted</b>'], ['<b>% Difference</b>'], ['<b>Severity (0-3)</b>']],
                    font=dict(color=['rgb(45, 45, 45)'] * 5, size=14),
                    fill=dict(color='#d562be')),
        cells=dict(values=[df.round(3)[k].tolist() for k in ['ds', 'y_'+country_filter, 'yhat_'+country_filter,
                                                             'percentage_change_'+country_filter, 'color_'+country_filter]],
                   line=dict(color='#506784'),
                   align=['center'] * 5,
                   font=dict(color=['rgb(40, 40, 40)'] * 5, size=12),
                   suffix=[None] + [''] + [''] + ['%'] + [''],
                   height=27,
                   # Row colour follows the severity level.
                   fill=dict(color=
                             [df['color_'+country_filter].map(color_map)],
                             )
                   ))
    # Red markers on the error panel at the anomaly points.
    anomalies = go.Scatter(name="Anomaly",
                           x=dates,
                           xaxis='x1',
                           yaxis='y1',
                           y=df['anomaly_points_'+country_filter],
                           mode='markers',
                           marker=dict(color='red',
                                       size=11, line=dict(
                                           color="red",
                                           width=2)))
    # Upper band (the '3s' column); fills down to the lower-band trace.
    upper_bound = go.Scatter(hoverinfo="skip",
                             x=dates,
                             showlegend=False,
                             xaxis='x1',
                             yaxis='y1',
                             y=df['3s_'+country_filter],
                             marker=dict(color="#444"),
                             line=dict(
                                 color=('rgb(23, 96, 167)'),
                                 width=2,
                                 dash='dash'),
                             fillcolor='rgba(68, 68, 68, 0.3)',
                             fill='tonexty')
    # Lower band ('-3s'); the pair is drawn as the confidence interval.
    lower_bound = go.Scatter(name='Confidence Interval',
                             x=dates,
                             xaxis='x1',
                             yaxis='y1',
                             y=df['-3s_'+country_filter],
                             marker=dict(color="#444"),
                             line=dict(
                                 color=('rgb(23, 96, 167)'),
                                 width=2,
                                 dash='dash'),
                             fillcolor='rgba(68, 68, 68, 0.3)',
                             fill='tonexty')
    # Actual vs predicted series on the top panel (x2/y2).
    Actuals = go.Scatter(name='Actuals',
                         x=dates,
                         y=df['y_'+country_filter],
                         xaxis='x2', yaxis='y2',
                         mode='lines',
                         marker=dict(size=12,
                                     line=dict(width=1),
                                     color="blue"))
    Predicted = go.Scatter(name='Predicted',
                           x=dates,
                           y=df['yhat_'+country_filter],
                           xaxis='x2', yaxis='y2',
                           mode='lines',
                           marker=dict(size=12,
                                       line=dict(width=1),
                                       color="orange"))
    # Forecast error line on the bottom panel.
    Error = go.Scatter(name="Error",
                       x=dates, y=df['error_'+country_filter],
                       xaxis='x1',
                       yaxis='y1',
                       mode='lines',
                       marker=dict(size=12,
                                   line=dict(width=1),
                                   color="red"),
                       text="Error")
    # Anomalies also marked on the actuals panel.
    anomalies_map = go.Scatter(name="anomaly actual",
                               showlegend=False,
                               x=dates,
                               y=anomaly_points,
                               mode='markers',
                               xaxis='x2',
                               yaxis='y2',
                               marker=dict(color="red",
                                           size=11,
                                           line=dict(
                                               color="red",
                                               width=2)))
    # Rolling mean of the error.
    Moving_average = go.Scatter(name="Moving Average",
                                x=dates,
                                y=df['meanval_'+country_filter],
                                mode='lines',
                                xaxis='x1',
                                yaxis='y1',
                                marker=dict(size=12,
                                            line=dict(width=1),
                                            color="green"),
                                text="Moving average")
    # Shared axis styling applied to both panels below.
    axis = dict(
        showline=True,
        zeroline=False,
        showgrid=True,
        mirror=True,
        ticklen=4,
        gridcolor='#ffffff',
        tickfont=dict(size=10))
    layout = dict(
        width=1000,
        height=865,
        autosize=False,
        title=metric_name,
        margin=dict(t=75),
        showlegend=True,
        xaxis1=dict(axis, **dict(domain=[0, 1], anchor='y1', showticklabels=True)),
        xaxis2=dict(axis, **dict(domain=[0, 1], anchor='y2', showticklabels=True)),
        yaxis1=dict(axis, **dict(domain=[2 * 0.21 + 0.20 + 0.09, 1], anchor='x1', hoverformat='.2f')),
        yaxis2=dict(axis, **dict(domain=[0.21 + 0.12, 2 * 0.31 + 0.02], anchor='x2', hoverformat='.2f')))
    fig = go.Figure(data=[table, anomalies, anomalies_map,
                          upper_bound, lower_bound, Actuals, Predicted,
                          Moving_average, Error], layout=layout)
    iplot(fig)
    pyplot.show()
# Recompute severity scores with the weekly rolling window, then plot.
anomaly_severity = anomaly_detection(target, 7)
anomaly_severity.reset_index(inplace=True)
# Skip the first rolling window (7 days) so half-filled rolling bands don't
# register as false anomalies.
first_valid_date = anomaly_severity.ds.min() + datetime.timedelta(days=7)
plot_anomaly(anomaly_severity[anomaly_severity.ds >= first_valid_date], "Anomaly of a Country")